Importing libraries and our dataset¶

In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

df_train = pd.read_csv('../data/for_modelling/credit_score_train.csv')
df_val = pd.read_csv('../data/for_modelling/credit_score_valid.csv')
df=pd.concat([df_train, df_val]) 
#df=df_train.copy()
df.head()
Out[1]:
INCOME SAVINGS DEBT R_SAVINGS_INCOME R_DEBT_INCOME R_DEBT_SAVINGS T_CLOTHING_12 T_CLOTHING_6 R_CLOTHING R_CLOTHING_INCOME ... R_EXPENDITURE_SAVINGS R_EXPENDITURE_DEBT CAT_GAMBLING CAT_DEBT CAT_CREDIT_CARD CAT_MORTGAGE CAT_SAVINGS_ACCOUNT CAT_DEPENDENTS CREDIT_SCORE DEFAULT
0 2783 1855 0 0.6665 0.00 0.0000 103 74 0.7184 0.0370 ... 2.5003 0.0000 No 0 0 0 1 0 570 0
1 314430 445442 707468 1.4167 2.25 1.5882 35861 29157 0.8131 0.1141 ... 0.5882 0.3704 High 1 0 1 1 0 691 0
2 161773 517674 2782496 3.2000 17.20 5.3750 3716 2533 0.6816 0.0230 ... 0.3125 0.0581 No 1 1 1 1 1 520 0
3 16014 97685 20818 6.1000 1.30 0.2131 637 187 0.2936 0.0398 ... 0.1639 0.7692 No 1 0 0 1 0 654 0
4 193225 1410542 2589215 7.3000 13.40 1.8356 5276 2325 0.4407 0.0273 ... 0.1370 0.0746 No 1 1 0 1 1 552 0

5 rows × 86 columns

In [2]:
# let's set our target and look into it
target='DEFAULT'
df[target].value_counts()
Out[2]:
DEFAULT
0    450
1    178
Name: count, dtype: int64

Conclusions after EDA:¶

  • there are no missing values
  • there is only one categorical variable - 'CAT_GAMBLING' (we used ordinal encoding for this variable)
  • there are some outliers, but not so many (so we will delete them manually later)
  • there were 86 columns and we deleted only one - 'ID'; for now we decided to not delete any other columns

Missing values¶

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 628 entries, 0 to 137
Data columns (total 86 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   INCOME                   628 non-null    int64  
 1   SAVINGS                  628 non-null    int64  
 2   DEBT                     628 non-null    int64  
 3   R_SAVINGS_INCOME         628 non-null    float64
 4   R_DEBT_INCOME            628 non-null    float64
 5   R_DEBT_SAVINGS           628 non-null    float64
 6   T_CLOTHING_12            628 non-null    int64  
 7   T_CLOTHING_6             628 non-null    int64  
 8   R_CLOTHING               628 non-null    float64
 9   R_CLOTHING_INCOME        628 non-null    float64
 10  R_CLOTHING_SAVINGS       628 non-null    float64
 11  R_CLOTHING_DEBT          628 non-null    float64
 12  T_EDUCATION_12           628 non-null    int64  
 13  T_EDUCATION_6            628 non-null    int64  
 14  R_EDUCATION              628 non-null    float64
 15  R_EDUCATION_INCOME       628 non-null    float64
 16  R_EDUCATION_SAVINGS      628 non-null    float64
 17  R_EDUCATION_DEBT         628 non-null    float64
 18  T_ENTERTAINMENT_12       628 non-null    int64  
 19  T_ENTERTAINMENT_6        628 non-null    int64  
 20  R_ENTERTAINMENT          628 non-null    float64
 21  R_ENTERTAINMENT_INCOME   628 non-null    float64
 22  R_ENTERTAINMENT_SAVINGS  628 non-null    float64
 23  R_ENTERTAINMENT_DEBT     628 non-null    float64
 24  T_FINES_12               628 non-null    int64  
 25  T_FINES_6                628 non-null    int64  
 26  R_FINES                  628 non-null    float64
 27  R_FINES_INCOME           628 non-null    float64
 28  R_FINES_SAVINGS          628 non-null    float64
 29  R_FINES_DEBT             628 non-null    float64
 30  T_GAMBLING_12            628 non-null    int64  
 31  T_GAMBLING_6             628 non-null    int64  
 32  R_GAMBLING               628 non-null    float64
 33  R_GAMBLING_INCOME        628 non-null    float64
 34  R_GAMBLING_SAVINGS       628 non-null    float64
 35  R_GAMBLING_DEBT          628 non-null    float64
 36  T_GROCERIES_12           628 non-null    int64  
 37  T_GROCERIES_6            628 non-null    int64  
 38  R_GROCERIES              628 non-null    float64
 39  R_GROCERIES_INCOME       628 non-null    float64
 40  R_GROCERIES_SAVINGS      628 non-null    float64
 41  R_GROCERIES_DEBT         628 non-null    float64
 42  T_HEALTH_12              628 non-null    int64  
 43  T_HEALTH_6               628 non-null    int64  
 44  R_HEALTH                 628 non-null    float64
 45  R_HEALTH_INCOME          628 non-null    float64
 46  R_HEALTH_SAVINGS         628 non-null    float64
 47  R_HEALTH_DEBT            628 non-null    float64
 48  T_HOUSING_12             628 non-null    int64  
 49  T_HOUSING_6              628 non-null    int64  
 50  R_HOUSING                628 non-null    float64
 51  R_HOUSING_INCOME         628 non-null    float64
 52  R_HOUSING_SAVINGS        628 non-null    float64
 53  R_HOUSING_DEBT           628 non-null    float64
 54  T_TAX_12                 628 non-null    int64  
 55  T_TAX_6                  628 non-null    int64  
 56  R_TAX                    628 non-null    float64
 57  R_TAX_INCOME             628 non-null    float64
 58  R_TAX_SAVINGS            628 non-null    float64
 59  R_TAX_DEBT               628 non-null    float64
 60  T_TRAVEL_12              628 non-null    int64  
 61  T_TRAVEL_6               628 non-null    int64  
 62  R_TRAVEL                 628 non-null    float64
 63  R_TRAVEL_INCOME          628 non-null    float64
 64  R_TRAVEL_SAVINGS         628 non-null    float64
 65  R_TRAVEL_DEBT            628 non-null    float64
 66  T_UTILITIES_12           628 non-null    int64  
 67  T_UTILITIES_6            628 non-null    int64  
 68  R_UTILITIES              628 non-null    float64
 69  R_UTILITIES_INCOME       628 non-null    float64
 70  R_UTILITIES_SAVINGS      628 non-null    float64
 71  R_UTILITIES_DEBT         628 non-null    float64
 72  T_EXPENDITURE_12         628 non-null    int64  
 73  T_EXPENDITURE_6          628 non-null    int64  
 74  R_EXPENDITURE            628 non-null    float64
 75  R_EXPENDITURE_INCOME     628 non-null    float64
 76  R_EXPENDITURE_SAVINGS    628 non-null    float64
 77  R_EXPENDITURE_DEBT       628 non-null    float64
 78  CAT_GAMBLING             628 non-null    object 
 79  CAT_DEBT                 628 non-null    int64  
 80  CAT_CREDIT_CARD          628 non-null    int64  
 81  CAT_MORTGAGE             628 non-null    int64  
 82  CAT_SAVINGS_ACCOUNT      628 non-null    int64  
 83  CAT_DEPENDENTS           628 non-null    int64  
 84  CREDIT_SCORE             628 non-null    int64  
 85  DEFAULT                  628 non-null    int64  
dtypes: float64(51), int64(34), object(1)
memory usage: 426.8+ KB

There are no missing values in our dataset :))

Categorical variables¶

In our dataset there is only one categorical variable - 'CAT_GAMBLING'.

In [4]:
df['CAT_GAMBLING'].value_counts()
Out[4]:
CAT_GAMBLING
No      392
High    163
Low      73
Name: count, dtype: int64

We will use ordinal encoding for this variable

In [5]:
df['CAT_GAMBLING'] = df['CAT_GAMBLING'].map({'No': 0, 'Low': 1,'High': 2})
df
Out[5]:
INCOME SAVINGS DEBT R_SAVINGS_INCOME R_DEBT_INCOME R_DEBT_SAVINGS T_CLOTHING_12 T_CLOTHING_6 R_CLOTHING R_CLOTHING_INCOME ... R_EXPENDITURE_SAVINGS R_EXPENDITURE_DEBT CAT_GAMBLING CAT_DEBT CAT_CREDIT_CARD CAT_MORTGAGE CAT_SAVINGS_ACCOUNT CAT_DEPENDENTS CREDIT_SCORE DEFAULT
0 2783 1855 0 0.6665 0.0000 0.0000 103 74 0.7184 0.0370 ... 2.5003 0.0000 0 0 0 0 1 0 570 0
1 314430 445442 707468 1.4167 2.2500 1.5882 35861 29157 0.8131 0.1141 ... 0.5882 0.3704 2 1 0 1 1 0 691 0
2 161773 517674 2782496 3.2000 17.2000 5.3750 3716 2533 0.6816 0.0230 ... 0.3125 0.0581 0 1 1 1 1 1 520 0
3 16014 97685 20818 6.1000 1.3000 0.2131 637 187 0.2936 0.0398 ... 0.1639 0.7692 0 1 0 0 1 0 654 0
4 193225 1410542 2589215 7.3000 13.4000 1.8356 5276 2325 0.4407 0.0273 ... 0.1370 0.0746 0 1 1 0 1 1 552 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
133 386976 1250231 654883 3.2308 1.6923 0.5238 36665 30569 0.8337 0.0947 ... 0.2381 0.4545 0 1 0 0 1 0 562 0
134 16772 8386 0 0.5000 0.0000 0.0000 238 186 0.7815 0.0142 ... 2.5000 0.9786 0 0 0 0 1 0 619 0
135 137509 206264 206264 1.5000 1.5000 1.0000 15661 10865 0.6938 0.1139 ... 0.6667 0.6667 0 1 0 0 1 0 639 1
136 259146 1922000 1662854 7.4167 6.4167 0.8652 12180 8527 0.7001 0.0470 ... 0.1124 0.1299 2 1 1 0 1 1 587 1
137 56657 442952 51506 7.8181 0.9091 0.1163 1085 289 0.2664 0.0192 ... 0.1163 1.0000 0 1 0 0 1 0 635 1

628 rows × 86 columns

Detecting outliers¶

In [6]:
#let's prepare dataset without 'DEFAULT', binary variables and 'Cat_gambling' because it was categorical
#we will use it often in the next steps
binary_cols = [col for col in df.columns if df[col].nunique() == 2]
rest_cols = [col for col in df.columns if col not in binary_cols]
rest_cols.remove('CAT_GAMBLING')

boxplot¶

In [7]:
# for col in rest_cols:
#     sns.boxplot(x=df[col])
#     plt.show()

There are too many columns in our dataset so boxplots won't be legible. We will try to detect outliers using other methods.

PyOD¶

In [8]:
from pyod.models.knn import KNN
clf = KNN(contamination=0.04)
clf.fit(df[rest_cols])
df['outliers'] = clf.labels_
df['outliers'].value_counts()
Out[8]:
outliers
0    602
1     26
Name: count, dtype: int64

This is automatic detection of outliers. We assume that 4% of our data is as outliers. (we checked different values and 4% seems to be the best) Let's check if we can find some outliers manually.

scatter plot¶

In [9]:
# we are showing scatterplots for all features except 'DEFAULT', binary variables and 'Cat_gambling' because it was categorical
df_without_binary = [col for col in df.columns if df[col].nunique() > 2]
columns_to_scatter_plot = [col for col in df_without_binary if col not in ['CREDIT_SCORE', 'CAT_GAMBLING', 'DEFAULT']]

fig, axs = plt.subplots(26, 3, figsize=(16, 100)) 
axs = axs.flatten()
for i, col in enumerate(columns_to_scatter_plot):
    sns.scatterplot(data=df, x=col, y="CREDIT_SCORE", ax=axs[i], s=15, hue='DEFAULT')

plt.tight_layout() 
plt.show()

Outliers:

  • 'T_CLOTHING_6' - above 37000 (there is only one observation)
  • 'R_CLOTHING_SAVINGS' - above 1.50 (there is only one observation)
  • 'R_CLOTHING_DEBT' - above 1.0 (there is only one observation)
  • 'R_EDUCATION_SAVINGS' - above 1.7 (there is only two observation)
  • 'R_EDUCATION_DEBT' - above 0.3 (there is only two observation)
  • 'R_ENTERAINMENT_INCOME' - above 1.2 (there is only two observation)
  • 'R_ENTERAINMENT_SAVINGS' - above 6 (there is only one observation)
  • 'R_ENTERAINMENT_DEBT' - above 2.0 (there is only two observation)
  • 'R_FINES_INCOME' - above 0.02 (there is only one observation)
  • 'R_FINES_SAVINGS' - above 0.05 (there is only one observation)
  • 'R_FINES_DEBT' - above 0.02 (there is only one observation)
  • 'R_GAMBLING_INCOME' - above 0.2 (there is only one observation)
  • 'R_GAMBLING_SAVINGS' - above 0.8 (there is only two observation)
  • 'R_GAMBLING_DEBT' - above 0.15 (there is only one observation)
  • 'R_GROCERIES_SAVINGS' - above 3.5 (there is only one observation)
  • 'T_HEALTH_12' - above 40000 (there is only one observation)
  • 't_health_6' - above 25000 (there is only two observation)
  • 'R_HEALTH_INCOME' - above 0.3 (there is only one observation)
  • 'R_HEALTH_SAVINGS' - above 0.8 (there is only one observation)
  • 'R_HOUSING_DEBT' - above 3 (there is only one observation)
  • 'R_TAX_DEBT' - above 0.15 (there is only two observation)

Let's remove these outliers

In [10]:
# Definiowanie granic dla outlierów
outliers_dict = {
    'T_CLOTHING_6': 37000,
    'R_CLOTHING_SAVINGS': 1.50,
    'R_CLOTHING_DEBT': 1.0,
    'R_EDUCATION_SAVINGS': 1.7,
    'R_EDUCATION_DEBT': 0.3,
    'R_ENTERTAINMENT_INCOME': 1.2,
    'R_ENTERTAINMENT_SAVINGS': 6,
    'R_ENTERTAINMENT_DEBT': 2.0,
    'R_FINES_INCOME': 0.02,
    'R_FINES_SAVINGS': 0.05,
    'R_FINES_DEBT': 0.02,
    'R_GAMBLING_INCOME': 0.2,
    'R_GAMBLING_SAVINGS': 0.8,
    'R_GAMBLING_DEBT': 0.15,
    'R_GROCERIES_SAVINGS': 3.5,
    'T_HEALTH_12': 40000,
    'T_HEALTH_6': 25000,
    'R_HEALTH_INCOME': 0.3,
    'R_HEALTH_SAVINGS': 0.8,
    'R_HOUSING_DEBT': 3,
    'R_TAX_DEBT': 0.15
}

for col, threshold in outliers_dict.items():
    df_without_outliers = df[df[col] <= threshold]

df_without_outliers.reset_index(drop=True, inplace=True)
df_without_outliers
Out[10]:
INCOME SAVINGS DEBT R_SAVINGS_INCOME R_DEBT_INCOME R_DEBT_SAVINGS T_CLOTHING_12 T_CLOTHING_6 R_CLOTHING R_CLOTHING_INCOME ... R_EXPENDITURE_DEBT CAT_GAMBLING CAT_DEBT CAT_CREDIT_CARD CAT_MORTGAGE CAT_SAVINGS_ACCOUNT CAT_DEPENDENTS CREDIT_SCORE DEFAULT outliers
0 2783 1855 0 0.6665 0.0000 0.0000 103 74 0.7184 0.0370 ... 0.0000 0 0 0 0 1 0 570 0 0
1 314430 445442 707468 1.4167 2.2500 1.5882 35861 29157 0.8131 0.1141 ... 0.3704 2 1 0 1 1 0 691 0 0
2 161773 517674 2782496 3.2000 17.2000 5.3750 3716 2533 0.6816 0.0230 ... 0.0581 0 1 1 1 1 1 520 0 0
3 16014 97685 20818 6.1000 1.3000 0.2131 637 187 0.2936 0.0398 ... 0.7692 0 1 0 0 1 0 654 0 0
4 193225 1410542 2589215 7.3000 13.4000 1.8356 5276 2325 0.4407 0.0273 ... 0.0746 0 1 1 0 1 1 552 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
621 386976 1250231 654883 3.2308 1.6923 0.5238 36665 30569 0.8337 0.0947 ... 0.4545 0 1 0 0 1 0 562 0 0
622 16772 8386 0 0.5000 0.0000 0.0000 238 186 0.7815 0.0142 ... 0.9786 0 0 0 0 1 0 619 0 0
623 137509 206264 206264 1.5000 1.5000 1.0000 15661 10865 0.6938 0.1139 ... 0.6667 0 1 0 0 1 0 639 1 0
624 259146 1922000 1662854 7.4167 6.4167 0.8652 12180 8527 0.7001 0.0470 ... 0.1299 2 1 1 0 1 1 587 1 1
625 56657 442952 51506 7.8181 0.9091 0.1163 1085 289 0.2664 0.0192 ... 1.0000 0 1 0 0 1 0 635 1 0

626 rows × 87 columns

Variables transformation¶

In [11]:
# histograms for our dataset
df[rest_cols].hist(bins=40, figsize=(20, 20))
plt.tight_layout()
plt.show()

Data transformation -Box Cox, standard

In [12]:
# use Box Cox transformation for rest_cols
from scipy.stats import boxcox
df_box_cox=df.copy()
for col in rest_cols:
    df_box_cox[col] = boxcox(df_box_cox[col]+1)[0]
df_box_cox.head()
Out[12]:
INCOME SAVINGS DEBT R_SAVINGS_INCOME R_DEBT_INCOME R_DEBT_SAVINGS T_CLOTHING_12 T_CLOTHING_6 R_CLOTHING R_CLOTHING_INCOME ... R_EXPENDITURE_DEBT CAT_GAMBLING CAT_DEBT CAT_CREDIT_CARD CAT_MORTGAGE CAT_SAVINGS_ACCOUNT CAT_DEPENDENTS CREDIT_SCORE DEFAULT outliers
0 43.459601 28.600123 0.000000 0.495605 0.000000 0.000000 9.924372 7.003166 0.690181 0.027768 ... 0.000000 0 0 0 0 1 0 8.383897e+07 0 0
1 241.416057 162.327132 94.552153 0.837915 1.265938 0.793731 70.869805 36.115755 0.777806 0.052375 ... 0.217763 2 1 0 1 1 0 1.506710e+08 0 0
2 190.490817 169.976872 131.672996 1.319896 3.469615 1.316598 34.737136 19.786169 0.655954 0.019159 ... 0.052655 0 1 1 1 1 1 6.339584e+07 0 0
3 82.849139 101.686796 39.199909 1.749466 0.875886 0.186041 19.314106 9.495216 0.288280 0.029270 ... 0.303274 0 1 0 0 1 0 1.274213e+08 0 0
4 202.960902 230.871110 129.412824 1.872176 3.142675 0.855484 38.876541 19.351044 0.429239 0.022012 ... 0.065827 0 1 1 0 1 1 7.603568e+07 0 0

5 rows × 87 columns

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

df_box_cox[rest_cols] = scaler.fit_transform(df_box_cox[rest_cols])

# histogram of rest_cols
df_box_cox[rest_cols].hist(bins=40, figsize=(20, 20))
plt.tight_layout()
plt.show()

box cox transformation without outliers

In [14]:
# use Box Cox transformation for rest_cols
from scipy.stats import boxcox
df_box_cox_woo=df_without_outliers.copy()
for col in rest_cols:
    df_box_cox_woo[col] = boxcox(df_box_cox_woo[col]+1)[0]
df_box_cox_woo.head()
Out[14]:
INCOME SAVINGS DEBT R_SAVINGS_INCOME R_DEBT_INCOME R_DEBT_SAVINGS T_CLOTHING_12 T_CLOTHING_6 R_CLOTHING R_CLOTHING_INCOME ... R_EXPENDITURE_DEBT CAT_GAMBLING CAT_DEBT CAT_CREDIT_CARD CAT_MORTGAGE CAT_SAVINGS_ACCOUNT CAT_DEPENDENTS CREDIT_SCORE DEFAULT outliers
0 43.539514 28.811891 0.000000 0.496158 0.000000 0.000000 9.911315 7.005131 0.693785 0.027736 ... 0.000000 0 0 0 0 1 0 8.252999e+07 0 0
1 242.205425 164.691595 95.550238 0.839518 1.266955 0.794490 70.620484 36.143759 0.782303 0.052229 ... 0.216475 2 1 0 1 1 0 1.482440e+08 0 0
2 191.073948 172.488174 133.240307 1.323959 3.476710 1.318901 34.646531 19.797235 0.659232 0.019144 ... 0.052592 0 1 1 1 1 1 6.242099e+07 0 0
3 83.043991 102.956985 39.484341 1.756750 0.876379 0.186079 19.276524 9.498531 0.288968 0.029234 ... 0.300408 0 1 0 0 1 0 1.253868e+08 0 0
4 203.593486 234.605521 130.943969 1.880567 3.148556 0.856376 38.769820 19.361727 0.430714 0.021992 ... 0.065727 0 1 1 0 1 1 7.485480e+07 0 0

5 rows × 87 columns

In [15]:
# do standard scaler for rest_cols
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

df_box_cox_woo[rest_cols] = scaler.fit_transform(df_box_cox_woo[rest_cols])

# histogram of rest_cols
df_box_cox_woo[rest_cols].hist(bins=40, figsize=(20, 20))
plt.tight_layout()
plt.show()

Model Comparison - datasets¶

  • Box Cox with outliers
  • Box Cox without outliers manually detected
  • Box Cox without outliers PyOD detected
  • Raw with outliers
  • Raw without outliers PyOD detected
  • Raw without outliers manually detected

Which Metric to Use?¶

  • Accuracy is overall correct
  • Recall is good for us because we may want to catch all the defaults (people who will not pay back the loan)
  • Precision is good for us because we want to minimize the number of false positives (people who are not defaulting but we predict they will) in some cases it is more important than recall
  • It depends on what we want to achieve

Preliminary modeling¶

We have:

  1. raw data
  2. after box cox
  3. with/without outliers
In [16]:
# scores for every method dataframe - cross validation
scores = pd.DataFrame(columns=['method', 'accuracy_type', 'accuracy_score', 'variance'])

We will try different modells on our dfs (after/before deleting outliers and using box cox) to check if our data preprocessing gives us better or worse results.

  1. Box Cox with outliers:
In [17]:
target="DEFAULT"
df_train=pd.read_csv('../data/for_modelling/credit_score_train.csv')
df_val=pd.read_csv('../data/for_modelling/credit_score_valid.csv')
# transform both train and valid datasets (BOX COX and Standard Scaler)
df_train2=df_train.copy()
df_val2=df_val.copy()
for col in rest_cols:
    df_train2[col] = boxcox(df_train2[col]+1)[0]
    df_val2[col] = boxcox(df_val2[col]+1)[0]
df_train2[rest_cols] = scaler.fit_transform(df_train2[rest_cols])
df_val2[rest_cols] = scaler.transform(df_val2[rest_cols])
# map CAT_GAMBLING (No-0, Low-1, High-2)
df_train2['CAT_GAMBLING'] = df_train2['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
df_val2['CAT_GAMBLING'] = df_val2['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
In [18]:
X=df_train2.drop(target, axis=1)
y=df_train2[target]

Dummy Classifier

In [19]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
# precision_score, recall_score, f1_score
from sklearn.metrics import precision_score, recall_score, f1_score
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, y)
y_pred = dummy_clf.predict(df_val2.drop(target, axis=1))
acc=accuracy_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_most_frequent', 'accuracy_type':'accuracy', 'accuracy_score':acc, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
prec=precision_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_most_frequent', 'accuracy_type':'precision', 'accuracy_score':prec, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
rec=recall_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_most_frequent', 'accuracy_type':'recall', 'accuracy_score':rec, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
f1=f1_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_most_frequent', 'accuracy_type':'f1', 'accuracy_score':f1, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
dummy_clf = DummyClassifier(strategy="constant", constant=1)
dummy_clf.fit(X, y)
y_pred = dummy_clf.predict(df_val2.drop(target, axis=1))
acc=accuracy_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_constant_1', 'accuracy_type':'accuracy', 'accuracy_score':acc, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
prec=precision_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_constant_1', 'accuracy_type':'precision', 'accuracy_score':prec, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
rec=recall_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_constant_1', 'accuracy_type':'recall', 'accuracy_score':rec, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
f1=f1_score(df_val2[target], y_pred)
df_row=pd.DataFrame({'method':'Dummy_constant_1', 'accuracy_type':'f1', 'accuracy_score':f1, 'variance':0}, index=[0])
scores=pd.concat([scores, df_row])
scores
/opt/homebrew/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
Out[19]:
method accuracy_type accuracy_score variance
0 Dummy_most_frequent accuracy 0.717391 0
0 Dummy_most_frequent precision 0.000000 0
0 Dummy_most_frequent recall 0.000000 0
0 Dummy_most_frequent f1 0.000000 0
0 Dummy_constant_1 accuracy 0.282609 0
0 Dummy_constant_1 precision 0.282609 0
0 Dummy_constant_1 recall 1.000000 0
0 Dummy_constant_1 f1 0.440678 0

SVC model

In [22]:
# SVC model
from sklearn.svm import SVC # another kernels than rbf and poly with 10 degrees are useless
# import classification report
# igonre wornings
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
model = SVC(random_state=42, kernel='poly')
model.fit(X, y)
y_pred = model.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        99
           1       0.28      1.00      0.44        39

    accuracy                           0.28       138
   macro avg       0.14      0.50      0.22       138
weighted avg       0.08      0.28      0.12       138
In [23]:
model = SVC(random_state=42, kernel='linear')
model.fit(X, y)
y_pred = model.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        99
           1       0.28      1.00      0.44        39

    accuracy                           0.28       138
   macro avg       0.14      0.50      0.22       138
weighted avg       0.08      0.28      0.12       138
In [24]:
model= SVC(gamma=2, C=1, random_state=42)
model.fit(X, y)
y_pred = model.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        99
           1       0.00      0.00      0.00        39

    accuracy                           0.72       138
   macro avg       0.36      0.50      0.42       138
weighted avg       0.51      0.72      0.60       138
In [25]:
model_rbf = SVC(random_state=42, kernel='rbf')
model_rbf.fit(X, y)
y_pred = model_rbf.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        99
           1       0.00      0.00      0.00        39

    accuracy                           0.72       138
   macro avg       0.36      0.50      0.42       138
weighted avg       0.51      0.72      0.60       138
In [26]:
model_poly = SVC(random_state=42, kernel='poly', degree=10)
model_poly.fit(X, y)
y_pred = model_poly.predict(df_val2.drop(target, axis=1))
report=classification_report(df_val2[target], y_pred)
print(report)
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        99
           1       0.00      0.00      0.00        39

    accuracy                           0.72       138
   macro avg       0.36      0.50      0.42       138
weighted avg       0.51      0.72      0.60       138

Other kernels than rbf and poly with 10 degrees are useless¶

  • we take into account only rbf and poly with 10 degrees
  • cross validation for rbf and poly with 10 degrees
In [27]:
from sklearn.model_selection import cross_val_score
df_cross_val = pd.concat([df_train2, df_val2])
X=df_cross_val.drop(target, axis=1)
y=df_cross_val[target]
rbf_scores=cross_val_score(model_rbf, X, y, cv=10, scoring='accuracy')
df_row=pd.DataFrame({'method':'SVC_rbf_box_cox_with_outliers', 'accuracy_type':'accuracy', 'accuracy_score':rbf_scores.mean(), 'variance':rbf_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
rbf_scores=cross_val_score(model_rbf, X, y, cv=10, scoring="precision")
df_row=pd.DataFrame({'method':'SVC_rbf_box_cox_with_outliers', 'accuracy_type':'precision', 'accuracy_score':rbf_scores.mean(), 'variance':rbf_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
rbf_scores=cross_val_score(model_rbf, X, y, cv=10, scoring="recall")
df_row=pd.DataFrame({'method':'SVC_rbf_box_cox_with_outliers', 'accuracy_type':'recall', 'accuracy_score':rbf_scores.mean(), 'variance':rbf_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
rbf_scores=cross_val_score(model_rbf, X, y, cv=10, scoring="f1")
df_row=pd.DataFrame({'method':'SVC_rbf_box_cox_with_outliers', 'accuracy_type':'f1', 'accuracy_score':rbf_scores.mean(), 'variance':rbf_scores.var()}, index=[0])
scores=pd.concat([scores, df_row])
scores
Out[27]:
method accuracy_type accuracy_score variance
0 Dummy_most_frequent accuracy 0.717391 0
0 Dummy_most_frequent precision 0.000000 0
0 Dummy_most_frequent recall 0.000000 0
0 Dummy_most_frequent f1 0.000000 0
0 Dummy_constant_1 accuracy 0.282609 0
0 Dummy_constant_1 precision 0.282609 0
0 Dummy_constant_1 recall 1.000000 0
0 Dummy_constant_1 f1 0.440678 0
0 SVC_rbf_box_cox_with_outliers accuracy 0.716590 0.000021
0 SVC_rbf_box_cox_with_outliers precision 0.000000 0.0
0 SVC_rbf_box_cox_with_outliers recall 0.000000 0.0
0 SVC_rbf_box_cox_with_outliers f1 0.000000 0.0

Random Forest model

In [28]:
# Random Forest model
from sklearn.ensemble import RandomForestClassifier
model_random_forest = RandomForestClassifier(random_state=42)
model_random_forest.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
    random_forest_scores=cross_val_score(model_random_forest, X, y, cv=10, scoring=type)
    df_row=pd.DataFrame({'method':'Random_Forest_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':random_forest_scores.mean(), 'variance':random_forest_scores.var()}, index=[0])
    scores=pd.concat([scores, df_row])
# Decision Tree model
from sklearn.tree import DecisionTreeClassifier
model_decision_tree = DecisionTreeClassifier(random_state=42, max_depth=5)
model_decision_tree.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
    decision_tree_scores=cross_val_score(model_decision_tree, X, y, cv=10, scoring=type)
    df_row=pd.DataFrame({'method':'Decision_Tree_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':decision_tree_scores.mean(), 'variance':decision_tree_scores.var()}, index=[0])
    scores=pd.concat([scores, df_row])
# ADA BOOST model
from sklearn.ensemble import AdaBoostClassifier
model_ada_boost = AdaBoostClassifier(random_state=42, algorithm='SAMME')
model_ada_boost.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
    ada_boost_scores=cross_val_score(model_ada_boost, X, y, cv=10, scoring=type)
    df_row=pd.DataFrame({'method':'Ada_Boost_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':ada_boost_scores.mean(), 'variance':ada_boost_scores.var()}, index=[0])
    scores=pd.concat([scores, df_row])
# Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
model_gaussian_nb = GaussianNB()
model_gaussian_nb.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
    gaussian_nb_scores=cross_val_score(model_gaussian_nb, X, y, cv=10, scoring=type)
    df_row=pd.DataFrame({'method':'Gaussian_NB_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':gaussian_nb_scores.mean(), 'variance':gaussian_nb_scores.var()}, index=[0])
    scores=pd.concat([scores, df_row])
# Neural Network model
from sklearn.neural_network import MLPClassifier
model_neural_network = MLPClassifier(random_state=42, max_iter=1000)
model_neural_network.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
    neural_network_scores=cross_val_score(model_neural_network, X, y, cv=10, scoring=type)
    df_row=pd.DataFrame({'method':'Neural_Network_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':neural_network_scores.mean(), 'variance':neural_network_scores.var()}, index=[0])
    scores=pd.concat([scores, df_row])
# logistic regression model
from sklearn.linear_model import LogisticRegression
model_logistic_regression = LogisticRegression(random_state=42)
model_logistic_regression.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
    logistic_regression_scores=cross_val_score(model_logistic_regression, X, y, cv=10, scoring=type)
    df_row=pd.DataFrame({'method':'Logistic_Regression_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':logistic_regression_scores.mean(), 'variance':logistic_regression_scores.var()}, index=[0])
    scores=pd.concat([scores, df_row])
# Quadratic Discriminant Analysis model
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
model_qda = QuadraticDiscriminantAnalysis()
model_qda.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
    qda_scores=cross_val_score(model_qda, X, y, cv=10, scoring=type)
    df_row=pd.DataFrame({'method':'QDA_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':qda_scores.mean(), 'variance':qda_scores.var()}, index=[0])
    scores=pd.concat([scores, df_row])
# Stochastic Gradient Descent model
from sklearn.linear_model import SGDClassifier
model_sgd = SGDClassifier(random_state=42)
model_sgd.fit(X, y)
for type in ['accuracy', 'precision', 'recall', 'f1']:
    sgd_scores=cross_val_score(model_sgd, X, y, cv=10, scoring=type)
    df_row=pd.DataFrame({'method':'SGD_box_cox_with_outliers', 'accuracy_type':type, 'accuracy_score':sgd_scores.mean(), 'variance':sgd_scores.var()}, index=[0])
    scores=pd.concat([scores, df_row])

scores.sort_values(by='accuracy_score', ascending=False)
Out[28]:
method accuracy_type accuracy_score variance
0 Dummy_constant_1 recall 1.000000 0
0 Ada_Boost_box_cox_with_outliers accuracy 0.722785 0.001157
0 Random_Forest_box_cox_with_outliers accuracy 0.719611 0.001203
0 Dummy_most_frequent accuracy 0.717391 0
0 SVC_rbf_box_cox_with_outliers accuracy 0.716590 0.000021
0 Decision_Tree_box_cox_with_outliers accuracy 0.711802 0.00269
0 Logistic_Regression_box_cox_with_outliers accuracy 0.699078 0.001829
0 Gaussian_NB_box_cox_with_outliers recall 0.696732 0.008663
0 Neural_Network_box_cox_with_outliers accuracy 0.666667 0.018147
0 QDA_box_cox_with_outliers recall 0.608824 0.03428
0 SGD_box_cox_with_outliers recall 0.566667 0.065309
0 Gaussian_NB_box_cox_with_outliers accuracy 0.561777 0.003879
0 Ada_Boost_box_cox_with_outliers precision 0.547251 0.021315
0 Decision_Tree_box_cox_with_outliers precision 0.540650 0.059391
0 Random_Forest_box_cox_with_outliers precision 0.538889 0.090094
0 SGD_box_cox_with_outliers accuracy 0.524680 0.019717
0 QDA_box_cox_with_outliers accuracy 0.480927 0.000824
0 Logistic_Regression_box_cox_with_outliers precision 0.475577 0.068937
0 Gaussian_NB_box_cox_with_outliers f1 0.474320 0.002451
0 Dummy_constant_1 f1 0.440678 0
0 Neural_Network_box_cox_with_outliers precision 0.408558 0.038278
0 SGD_box_cox_with_outliers f1 0.390586 0.008212
0 QDA_box_cox_with_outliers f1 0.388818 0.00778
0 Gaussian_NB_box_cox_with_outliers precision 0.362814 0.001931
0 SGD_box_cox_with_outliers precision 0.325194 0.005431
0 Ada_Boost_box_cox_with_outliers f1 0.304584 0.009312
0 Neural_Network_box_cox_with_outliers f1 0.289932 0.020399
0 Decision_Tree_box_cox_with_outliers f1 0.289495 0.012696
0 Neural_Network_box_cox_with_outliers recall 0.288889 0.068395
0 QDA_box_cox_with_outliers precision 0.288568 0.00271
0 Dummy_constant_1 precision 0.282609 0
0 Dummy_constant_1 accuracy 0.282609 0
0 Random_Forest_box_cox_with_outliers f1 0.248681 0.011505
0 Ada_Boost_box_cox_with_outliers recall 0.219281 0.00589
0 Logistic_Regression_box_cox_with_outliers f1 0.214305 0.011297
0 Decision_Tree_box_cox_with_outliers recall 0.212092 0.008908
0 Random_Forest_box_cox_with_outliers recall 0.167320 0.005487
0 Logistic_Regression_box_cox_with_outliers recall 0.150980 0.006862
0 Dummy_most_frequent recall 0.000000 0
0 Dummy_most_frequent f1 0.000000 0
0 Dummy_most_frequent precision 0.000000 0
0 SVC_rbf_box_cox_with_outliers recall 0.000000 0.0
0 SVC_rbf_box_cox_with_outliers f1 0.000000 0.0
0 SVC_rbf_box_cox_with_outliers precision 0.000000 0.0
In [29]:
import seaborn as sns
sns.barplot(x='accuracy_score', y='method', data=scores, hue='accuracy_type')
Out[29]:
<Axes: xlabel='accuracy_score', ylabel='method'>

Everything here is totally useless I think, because Dummy Classifier is one of the best models.

  1. Box Cox without outliers (we'll delete outliers for train df)
In [30]:
# Definiowanie granic dla outlierów
outliers_dict = {
    'T_CLOTHING_6': 37000,
    'R_CLOTHING_SAVINGS': 1.50,
    'R_CLOTHING_DEBT': 1.0,
    'R_EDUCATION_SAVINGS': 1.7,
    'R_EDUCATION_DEBT': 0.3,
    'R_ENTERTAINMENT_INCOME': 1.2,
    'R_ENTERTAINMENT_SAVINGS': 6,
    'R_ENTERTAINMENT_DEBT': 2.0,
    'R_FINES_INCOME': 0.02,
    'R_FINES_SAVINGS': 0.05,
    'R_FINES_DEBT': 0.02,
    'R_GAMBLING_INCOME': 0.2,
    'R_GAMBLING_SAVINGS': 0.8,
    'R_GAMBLING_DEBT': 0.15,
    'R_GROCERIES_SAVINGS': 3.5,
    'T_HEALTH_12': 40000,
    'T_HEALTH_6': 25000,
    'R_HEALTH_INCOME': 0.3,
    'R_HEALTH_SAVINGS': 0.8,
    'R_HOUSING_DEBT': 3,
    'R_TAX_DEBT': 0.15
}

for col, threshold in outliers_dict.items():
    df_without_outliers_bc_train = df_train2[df_train2[col] <= threshold]

df_without_outliers_bc_train.reset_index(drop=True, inplace=True)
df_without_outliers_bc_train
Out[30]:
INCOME SAVINGS DEBT R_SAVINGS_INCOME R_DEBT_INCOME R_DEBT_SAVINGS T_CLOTHING_12 T_CLOTHING_6 R_CLOTHING R_CLOTHING_INCOME ... R_EXPENDITURE_SAVINGS R_EXPENDITURE_DEBT CAT_GAMBLING CAT_DEBT CAT_CREDIT_CARD CAT_MORTGAGE CAT_SAVINGS_ACCOUNT CAT_DEPENDENTS CREDIT_SCORE DEFAULT
0 0.619996 0.577458 1.439124 0.130956 1.553292 0.832731 -0.047917 0.356240 0.956257 -1.121007 ... -0.260731 -1.127503 0 1 1 1 1 1 -1.130432 0
1 0.817704 1.623598 1.377102 0.985366 1.247825 0.051701 0.205790 0.303502 -0.069508 -0.893278 ... -0.952011 -1.007798 0 1 1 0 1 1 -0.680778 0
2 0.595037 -0.020955 0.861507 -0.544491 0.728279 0.852186 0.527802 0.433736 -0.397748 0.076292 ... 0.413988 -0.794487 0 1 0 0 1 0 -0.466063 0
3 0.639110 -0.470106 0.824800 -1.091724 0.628010 1.269785 1.436630 1.621757 1.021196 1.657935 ... 0.873646 -0.932200 0 1 1 0 1 0 -0.173222 1
4 0.773380 1.625310 0.676118 1.027970 0.279616 -0.558965 0.521028 0.640860 0.198068 -0.220086 ... -0.978246 -0.382531 2 1 0 0 1 1 -0.695687 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
308 -0.805311 0.073590 -1.080262 1.349440 -1.176479 -1.279739 -0.860047 -0.308425 1.727215 -0.651681 ... -1.154654 1.536221 1 1 0 0 1 0 -0.305739 0
309 -0.967598 -1.220451 0.160487 -0.798347 1.916246 1.615881 -0.612525 -0.243081 0.582913 0.775811 ... 1.065847 -1.153746 0 1 0 0 1 0 -2.176996 0
310 0.042250 1.340068 -0.367899 1.544719 -0.702088 -1.180194 -0.326573 -0.258194 -0.478309 -0.908676 ... -1.244763 0.817954 0 1 0 0 1 0 0.684931 0
311 0.840432 1.250887 1.106586 0.624554 0.841425 0.047313 0.587031 0.645190 0.052445 -0.191677 ... -0.775283 -0.860543 2 1 0 1 1 1 -0.370573 0
312 0.420570 -0.660180 0.396702 -1.175901 0.184151 1.095918 0.951649 -0.760120 -1.909047 1.238277 ... 1.280243 -0.382531 2 1 0 0 1 0 0.284337 0

313 rows × 86 columns

In [31]:
X2=df_without_outliers_bc_train.drop(target, axis=1)
y2= df_without_outliers_bc_train[target]

SVC

In [32]:
# SVC model
from sklearn.svm import SVC # another kernels than rbf and poly with 10 degrees are useless
from sklearn.metrics import accuracy_score
model = SVC(random_state=42, kernel='poly', degree=10)
model.fit(X2, y2)
y_pred = model.predict(df_val2.drop(target, axis=1))
raport=classification_report(df_val2[target], y_pred)
print(raport)
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        99
           1       0.00      0.00      0.00        39

    accuracy                           0.72       138
   macro avg       0.36      0.50      0.42       138
weighted avg       0.51      0.72      0.60       138

Random Forest model

In [33]:
# Random Forest model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X2, y2)
y_pred = model.predict(df_val2.drop(target, axis=1))
raport=classification_report(df_val2[target], y_pred)
print(raport)
              precision    recall  f1-score   support

           0       0.72      0.23      0.35        99
           1       0.28      0.77      0.41        39

    accuracy                           0.38       138
   macro avg       0.50      0.50      0.38       138
weighted avg       0.60      0.38      0.37       138
  1. Raw with outliers
In [34]:
X=df_train.drop(target, axis=1)
X_val=df_val.drop(target, axis=1)
X_val['CAT_GAMBLING'] = X_val['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
# map CAT_GAMBLING (No-0, Low-1, High-2)
X['CAT_GAMBLING'] = X['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
y=df_train[target]

SVC

In [35]:
# SVC model
from sklearn.svm import SVC # another kernels than rbf and poly with 10 degrees are useless
from sklearn.metrics import accuracy_score
model = SVC(random_state=42, kernel='poly', degree=10)
model.fit(X, y)
y_pred = model.predict(df_val2.drop(target, axis=1))
raport=classification_report(df_val2[target], y_pred)
print(raport)
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        99
           1       0.00      0.00      0.00        39

    accuracy                           0.72       138
   macro avg       0.36      0.50      0.42       138
weighted avg       0.51      0.72      0.60       138

Forest model

In [36]:
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
y_pred = model.predict(X_val)
raport=classification_report(df_val[target], y_pred)
print(raport)
              precision    recall  f1-score   support

           0       0.75      0.90      0.82        99
           1       0.47      0.23      0.31        39

    accuracy                           0.71       138
   macro avg       0.61      0.56      0.56       138
weighted avg       0.67      0.71      0.67       138
  1. raw without outliers
In [37]:
# Definiowanie granic dla outlierów
outliers_dict = {
    'T_CLOTHING_6': 37000,
    'R_CLOTHING_SAVINGS': 1.50,
    'R_CLOTHING_DEBT': 1.0,
    'R_EDUCATION_SAVINGS': 1.7,
    'R_EDUCATION_DEBT': 0.3,
    'R_ENTERTAINMENT_INCOME': 1.2,
    'R_ENTERTAINMENT_SAVINGS': 6,
    'R_ENTERTAINMENT_DEBT': 2.0,
    'R_FINES_INCOME': 0.02,
    'R_FINES_SAVINGS': 0.05,
    'R_FINES_DEBT': 0.02,
    'R_GAMBLING_INCOME': 0.2,
    'R_GAMBLING_SAVINGS': 0.8,
    'R_GAMBLING_DEBT': 0.15,
    'R_GROCERIES_SAVINGS': 3.5,
    'T_HEALTH_12': 40000,
    'T_HEALTH_6': 25000,
    'R_HEALTH_INCOME': 0.3,
    'R_HEALTH_SAVINGS': 0.8,
    'R_HOUSING_DEBT': 3,
    'R_TAX_DEBT': 0.15
}

for col, threshold in outliers_dict.items():
    df_without_outliers_train = df_train[df_train[col] <= threshold]

df_without_outliers_train.reset_index(drop=True, inplace=True)
df_without_outliers_train
Out[37]:
INCOME SAVINGS DEBT R_SAVINGS_INCOME R_DEBT_INCOME R_DEBT_SAVINGS T_CLOTHING_12 T_CLOTHING_6 R_CLOTHING R_CLOTHING_INCOME ... R_EXPENDITURE_SAVINGS R_EXPENDITURE_DEBT CAT_GAMBLING CAT_DEBT CAT_CREDIT_CARD CAT_MORTGAGE CAT_SAVINGS_ACCOUNT CAT_DEPENDENTS CREDIT_SCORE DEFAULT
0 2783 1855 0 0.6665 0.0000 0.0000 103 74 0.7184 0.0370 ... 2.5003 0.0000 No 0 0 0 1 0 570 0
1 314430 445442 707468 1.4167 2.2500 1.5882 35861 29157 0.8131 0.1141 ... 0.5882 0.3704 High 1 0 1 1 0 691 0
2 161773 517674 2782496 3.2000 17.2000 5.3750 3716 2533 0.6816 0.0230 ... 0.3125 0.0581 No 1 1 1 1 1 520 0
3 16014 97685 20818 6.1000 1.3000 0.2131 637 187 0.2936 0.0398 ... 0.1639 0.7692 No 1 0 0 1 0 654 0
4 193225 1410542 2589215 7.3000 13.4000 1.8356 5276 2325 0.4407 0.0273 ... 0.1370 0.0746 No 1 1 0 1 1 552 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
484 20007 22508 457665 1.1250 22.8752 20.3334 1451 861 0.5934 0.0725 ... 1.1111 0.0546 No 1 0 0 1 0 423 0
485 20846 268679 23162 12.8888 1.1111 0.0862 1302 533 0.4094 0.0625 ... 0.0862 1.0000 Low 1 0 0 1 0 584 0
486 89235 1106514 160623 12.4000 1.8000 0.1452 2413 835 0.3460 0.0270 ... 0.0806 0.5556 No 1 0 0 1 0 631 0
487 197073 1021195 1863233 5.1818 9.4545 1.8246 8416 3948 0.4691 0.0427 ... 0.1754 0.0962 High 1 0 1 1 1 572 0
488 133592 85013 680103 0.6364 5.0909 8.0000 12476 264 0.0212 0.0934 ... 1.4286 0.1786 High 1 0 0 1 0 610 0

489 rows × 86 columns

In [38]:
X3=df_without_outliers_train.drop(target, axis=1)
y3= df_without_outliers_train[target]

X3['CAT_GAMBLING'] = X3['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})

SVC

In [39]:
# SVC model
from sklearn.svm import SVC # another kernels than rbf and poly with 10 degrees are useless
from sklearn.metrics import accuracy_score
model = SVC(random_state=42, kernel='poly', degree=10)
model.fit(X3, y3)
y_pred = model.predict(df_val2.drop(target, axis=1))
raport=classification_report(df_val2[target], y_pred)
print(raport)
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        99
           1       0.00      0.00      0.00        39

    accuracy                           0.72       138
   macro avg       0.36      0.50      0.42       138
weighted avg       0.51      0.72      0.60       138

Random Forest

In [40]:
model = RandomForestClassifier(random_state=42)
model.fit(X3, y3)
y_pred = model.predict(X_val)
raport=classification_report(df_val[target], y_pred)
print(raport)
              precision    recall  f1-score   support

           0       0.74      0.90      0.81        99
           1       0.41      0.18      0.25        39

    accuracy                           0.70       138
   macro avg       0.57      0.54      0.53       138
weighted avg       0.64      0.70      0.65       138

For SVC data processing and transformation doesn't change accuracy_score, with RandomForestClassifier the accuracy_score is the highest when withour transormation.

  1. COX BOX WITHOUT OUTLIERS DETECTED BY PYOD AND WITHOUT COLUMNS WITH HIGH CORRELATION
In [42]:
# columns to remove from EDA
columns_to_remove = ["T_CLOTHING_12","T_ENTERTAINMENT_12", "T_GROCERIES_12", "T_GROCERIES_6", "T_HEALTH_12", "T_TAX_12", "T_TAX_6", "T_TRAVEL_12", "T_TRAVEL_6","T_UTILITIES_12", "T_UTILITIES_6", "T_EXPENDITURE_12", "T_EXPENDITURE_6"]
columns_to_remove
Out[42]:
['T_CLOTHING_12',
 'T_ENTERTAINMENT_12',
 'T_GROCERIES_12',
 'T_GROCERIES_6',
 'T_HEALTH_12',
 'T_TAX_12',
 'T_TAX_6',
 'T_TRAVEL_12',
 'T_TRAVEL_6',
 'T_UTILITIES_12',
 'T_UTILITIES_6',
 'T_EXPENDITURE_12',
 'T_EXPENDITURE_6']
In [49]:
df_train=pd.read_csv('../data/for_modelling/credit_score_train.csv')
df_val=pd.read_csv('../data/for_modelling/credit_score_valid.csv')
# map CAT_GAMBLING (No-0, Low-1, High-2)
df_train['CAT_GAMBLING'] = df_train['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
df_val['CAT_GAMBLING'] = df_val['CAT_GAMBLING'].map({'No':0, 'Low':1, 'High':2})
# remove columns with high correlation
df_train.drop(columns=columns_to_remove, inplace=True)
df_val.drop(columns=columns_to_remove, inplace=True)
# remove outliers
from pyod.models.knn import KNN
clf = KNN(contamination=0.04)
rest_cols = [col for col in df_train.columns if col not in columns_to_remove]
# remove DEFAULT column
rest_cols.remove('DEFAULT')
clf.fit(df_train[rest_cols])
df_train['outliers'] = clf.labels_
# sum of outliers
df_train['outliers'].value_counts()
Out[49]:
outliers
0    470
1     20
Name: count, dtype: int64
In [50]:
df_train = df_train[df_train['outliers'] == 0]
df_train.drop(columns='outliers', inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_train
Out[50]:
INCOME SAVINGS DEBT R_SAVINGS_INCOME R_DEBT_INCOME R_DEBT_SAVINGS T_CLOTHING_6 R_CLOTHING R_CLOTHING_INCOME R_CLOTHING_SAVINGS ... R_EXPENDITURE_SAVINGS R_EXPENDITURE_DEBT CAT_GAMBLING CAT_DEBT CAT_CREDIT_CARD CAT_MORTGAGE CAT_SAVINGS_ACCOUNT CAT_DEPENDENTS CREDIT_SCORE DEFAULT
0 2783 1855 0 0.6665 0.0000 0.0000 74 0.7184 0.0370 0.0555 ... 2.5003 0.0000 0 0 0 0 1 0 570 0
1 314430 445442 707468 1.4167 2.2500 1.5882 29157 0.8131 0.1141 0.0805 ... 0.5882 0.3704 2 1 0 1 1 0 691 0
2 161773 517674 2782496 3.2000 17.2000 5.3750 2533 0.6816 0.0230 0.0072 ... 0.3125 0.0581 0 1 1 1 1 1 520 0
3 16014 97685 20818 6.1000 1.3000 0.2131 187 0.2936 0.0398 0.0065 ... 0.1639 0.7692 0 1 0 0 1 0 654 0
4 193225 1410542 2589215 7.3000 13.4000 1.8356 2325 0.4407 0.0273 0.0037 ... 0.1370 0.0746 0 1 1 0 1 1 552 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
465 20007 22508 457665 1.1250 22.8752 20.3334 861 0.5934 0.0725 0.0645 ... 1.1111 0.0546 0 1 0 0 1 0 423 0
466 20846 268679 23162 12.8888 1.1111 0.0862 533 0.4094 0.0625 0.0048 ... 0.0862 1.0000 1 1 0 0 1 0 584 0
467 89235 1106514 160623 12.4000 1.8000 0.1452 835 0.3460 0.0270 0.0022 ... 0.0806 0.5556 0 1 0 0 1 0 631 0
468 197073 1021195 1863233 5.1818 9.4545 1.8246 3948 0.4691 0.0427 0.0082 ... 0.1754 0.0962 2 1 0 1 1 1 572 0
469 133592 85013 680103 0.6364 5.0909 8.0000 264 0.0212 0.0934 0.1468 ... 1.4286 0.1786 2 1 0 0 1 0 610 0

470 rows × 73 columns

In [51]:
X=df_train.drop(target, axis=1)
y=df_train[target]
X_val=df_val.drop(target, axis=1)
y_val=df_val[target]
# Box Cox and standart scalling
for col in rest_cols:
    X[col] = boxcox(X[col]+1)[0]
    X_val[col] = boxcox(X_val[col]+1)[0]
X[rest_cols] = scaler.fit_transform(X[rest_cols])
X_val[rest_cols] = scaler.transform(X_val[rest_cols])

Let's check the models

In [57]:
score=pd.DataFrame(columns=['method', 'accuracy_type', 'accuracy_score', 'variance'])

names=[
    'SVC',
    'Random_Forest',
    'Decision_Tree',
    'Ada_Boost',
    'Gaussian_NB',
    'Neural_Network',
    'Logistic_Regression',
    'QDA',
    'SGD',
    'Dummy_most_frequent',
]

classifiers=[
    SVC(random_state=42, kernel='poly', degree=10),
    RandomForestClassifier(random_state=42),
    DecisionTreeClassifier(random_state=42, max_depth=5),
    AdaBoostClassifier(random_state=42, algorithm='SAMME'),
    GaussianNB(),
    MLPClassifier(random_state=42, max_iter=1000),
    LogisticRegression(random_state=42),
    QuadraticDiscriminantAnalysis(),
    SGDClassifier(random_state=42),
    DummyClassifier(strategy="most_frequent")
]

for name, classifier in zip(names, classifiers):
    classifier.fit(X, y)
    for type in ['accuracy', 'precision', 'recall', 'f1']:
        scores=cross_val_score(classifier, pd.concat([X,X_val]), pd.concat([y, y_val]), cv=10, scoring=type)
        df_row=pd.DataFrame({'method':name, 'accuracy_type':type, 'accuracy_score':scores.mean(), 'variance':scores.var()}, index=[0])
        score=pd.concat([score, df_row])
score.sort_values(by='accuracy_score', ascending=False)
Out[57]:
method accuracy_type accuracy_score variance
0 Dummy_most_frequent accuracy 0.717104 0.000040
0 Neural_Network accuracy 0.713825 0.000755
0 Random_Forest accuracy 0.713798 0.001097
0 Logistic_Regression accuracy 0.708743 0.001495
0 Ada_Boost accuracy 0.707186 0.000773
0 SVC accuracy 0.705410 0.001291
0 Decision_Tree accuracy 0.697240 0.001437
0 Gaussian_NB recall 0.678758 0.008176
0 Gaussian_NB accuracy 0.573689 0.003685
0 QDA recall 0.556863 0.050690
0 QDA accuracy 0.537869 0.003529
0 SGD recall 0.530719 0.064501
0 Random_Forest precision 0.514762 0.056325
0 SGD accuracy 0.494372 0.012958
0 Ada_Boost precision 0.484300 0.022240
0 Gaussian_NB f1 0.474327 0.001995
0 Logistic_Regression precision 0.458651 0.022632
0 Decision_Tree precision 0.455476 0.021991
0 Neural_Network precision 0.392858 0.055516
0 QDA f1 0.387052 0.016288
0 Gaussian_NB precision 0.368165 0.001808
0 SGD f1 0.358272 0.006583
0 Logistic_Regression f1 0.312510 0.016061
0 QDA precision 0.303459 0.006640
0 SGD precision 0.287123 0.003473
0 Decision_Tree f1 0.277330 0.005268
0 Neural_Network f1 0.276050 0.029940
0 Logistic_Regression recall 0.243137 0.012859
0 Ada_Boost f1 0.237586 0.006305
0 Random_Forest f1 0.229153 0.007249
0 Neural_Network recall 0.225163 0.024914
0 Decision_Tree recall 0.208497 0.004412
0 SVC precision 0.180000 0.100489
0 Ada_Boost recall 0.168627 0.005840
0 Random_Forest recall 0.150980 0.003522
0 SVC f1 0.091118 0.025519
0 SVC recall 0.087908 0.026491
0 Dummy_most_frequent precision 0.000000 0.000000
0 Dummy_most_frequent recall 0.000000 0.000000
0 Dummy_most_frequent f1 0.000000 0.000000
In [58]:
import seaborn as sns
sns.barplot(x='accuracy_score', y='method', data=score, hue='accuracy_type')
Out[58]:
<Axes: xlabel='accuracy_score', ylabel='method'>

Conclusions¶

  • Naive Bayes is the best model because Recall is important for us.
  • Remove high correlation columns and outliers detected by PyOD is a good idea.
  • Currently we have bad results.